import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Load the raw vehicle-silhouette dataset and keep an untouched copy in `data`.
df = pd.read_csv('vehicle-1.csv')
data = df.copy()
# Notebook-style inspection cells: preview, column names, shape, dtypes, missing counts.
df.head()
df.columns
df.shape
df.info()
df.isnull().sum()
The target variable in this case is the `class` column.
# Name of the dependent (target) column.
TARGET = "class"
# Class distribution before encoding.
df[TARGET].value_counts()
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Encode the string class labels into integers, in place.
df[TARGET] = labelencoder.fit_transform(df[TARGET])
# Class distribution after encoding (counts unchanged, labels now numeric).
df[TARGET].value_counts()
Since the number of missing values is not too large, we will replace the missing values with the MEDIAN of each column.
from sklearn.impute import SimpleImputer
# Fill missing values with the MEDIAN of each column (comment previously said
# "mean", contradicting strategy='median').
# FIX: the `verbose` parameter of SimpleImputer was deprecated in scikit-learn
# 1.1 and removed in 1.3, so passing it now raises a TypeError.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
transformed_values = imputer.fit_transform(df)
column = df.columns
# fit_transform returns a bare ndarray; rebuild the DataFrame with the
# original column names.
df = pd.DataFrame(transformed_values, columns = column )
print("Data after treating missing value : ")
df.isnull().sum()
df.describe().transpose()
# Pairwise feature relationships with KDE curves on the diagonal.
sns.pairplot(df, diag_kind='kde')
#Method to show Distribution & Box plot for the variable along with skewness
def showPlots(df, col):
    """Show a distribution plot (with KDE) and a box plot for df[col],
    and print the column's skewness."""
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
    fig.set_size_inches(10, 2)
    # FIX: sns.distplot is deprecated (removed in seaborn 0.14);
    # histplot(kde=True) is the documented replacement.
    sns.histplot(df[col], kde=True, ax=ax1)
    ax1.set_title("Distribution Plot")
    # FIX: pass the data by keyword — the positional data argument to
    # seaborn axes-level functions is deprecated since 0.12.
    sns.boxplot(x=df[col], ax=ax2)
    ax2.set_title("Box Plot")
    print(df[col].skew())
# Distribution/box plots (plus skewness) for every numeric feature, in the
# same order as before.
for feature in [
    'compactness', 'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
    'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
    'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
    'skewness_about.2', 'hollows_ratio',
]:
    showPlots(df, feature)
From the above graphs, it can be seen that the columns radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_variance, scaled_variance.1, scaled_radius_of_gyration.1, skewness_about, skewness_about.1 have outliers. Therefore, we will treat outliers of these columns before proceeding further.
#We will handle the outliers using IQR for all the columns
from scipy.stats import iqr
def handleOutliers(odf):
    """Drop every row that contains an outlier in any column.

    A value is an outlier when it falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    for its column. Prints the shape of the cleaned frame and returns it.
    """
    q1 = odf.quantile(0.25)
    q3 = odf.quantile(0.75)
    spread = q3 - q1
    lower = q1 - 1.5 * spread
    upper = q3 + 1.5 * spread
    # Keep the negated any() form so rows of all-NaN comparisons behave
    # exactly as before.
    has_outlier = ((odf < lower) | (odf > upper)).any(axis=1)
    cleandf = odf[~has_outlier]
    print(cleandf.shape)
    return cleandf
# Remove outlier rows, then check the class balance that remains.
df = handleOutliers(df)
df.hist(column=TARGET)
# Correlation heatmap: show only the lower triangle to avoid duplicate cells.
corr = df.corr()
# select only the lower triangle of the correlation matrix
lower_triangle = np.tril(corr)
# to mask the upper triangle in the following heatmap
mask = lower_triangle == 0
plt.figure(figsize = (25,14))
sns.set(font_scale=1.8)
# Setting it to white so that we do not see the grid lines
sns.set_style(style = 'white')
sns.heatmap(lower_triangle, center=0.5, cmap= 'coolwarm', annot= True, xticklabels = corr.index, yticklabels = corr.columns,
cbar= False, mask = mask, linecolor='white', vmax=.8, fmt='.2f',linewidths=0.01)
# Workaround for the known Matplotlib 3.1.1 bug that clips the top and bottom heatmap rows
b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) # update the ylim(bottom, top) values
plt.show()
# Restore the default font scale for subsequent plots.
sns.set(font_scale=1)
A few important observations follow from the correlation heatmap above.
We will consider a couple of the most popular approaches for resolving multicollinearity in the dataset.
# Snapshot of the cleaned dataframe reused by every modelling approach below.
processed_df = df
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from prettytable import PrettyTable
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Hold-out fraction and RNG seed shared by all train/test splits.
SPLIT_VALUE = 0.30
SEED = 1501
# Function to split Target Variable from data
def SplitData(d, target=None):
    """Split a dataframe into the feature matrix X and the target series y.

    Parameters:
        d: input dataframe containing the target column.
        target: name of the dependent-variable column. Defaults to the
            module-level TARGET, so existing callers are unaffected; the
            parameter generalizes the helper to any target column.

    Returns:
        (X, y) — features with the target column dropped, and the target.
    """
    if target is None:
        target = TARGET
    # Set of Independent Variables
    X = d.drop(target, axis=1)
    # Dependent Variable
    y = d[target]
    return X, y
# Function to Scale data
def ScaleData(X):
    """Return X standardized to zero mean and unit variance.

    Note: fits a fresh StandardScaler on whatever it is given.
    """
    return preprocessing.StandardScaler().fit_transform(X)
# Function to split data into training & test set
def SplitTrainTestData(X, y):
    """Stratified train/test split using the shared SPLIT_VALUE and SEED;
    prints and returns the resulting partitions."""
    parts = train_test_split(X, y, test_size=SPLIT_VALUE, stratify=y,
                             random_state=SEED)
    Xtrain, Xtest, ytrain, ytest = parts
    print("Training Data Shape: {0}".format(Xtrain.shape))
    print("Testing Data Shape: {0}".format(Xtest.shape))
    return Xtrain, Xtest, ytrain, ytest
# Function to fit the model
def ModelFit(model, Xtr, ytr):
    """Fit `model` on the training data, echo the fitted estimator,
    and return the (now fitted) model."""
    fitted = model.fit(Xtr, ytr)
    print(fitted)
    return model
# Function to predict from model
def ModelPredict(model, Xtt, ytt):
    """Predict on the test features and return (predictions, accuracy)."""
    predictions = model.predict(Xtt)
    return predictions, accuracy_score(ytt, predictions)
# Function to print the Results of the model
def PrintResults(model, pred, Xtr, Xtt, ytr, ytt):
    """Print a PrettyTable of classification metrics plus train/test scores
    for an already-fitted model."""
    report = PrettyTable()
    report.field_names = ["Metrics", "Results"]
    rows = [
        ["Classification Report", classification_report(ytt, pred)],
        ["Accuracy Score", accuracy_score(ytt, pred)],
        ["", ""],
        ["Confusion Matrix", confusion_matrix(ytt, pred)],
        ["", ""],
        ["Training Data Score", model.score(Xtr, ytr)],
        ["", ""],
        ["Testing Data Score", model.score(Xtt, ytt)],
    ]
    for row in rows:
        report.add_row(row)
    print(report)
def SVCTuneHyperParams(svc, Xtr, ytr):
    """Grid-search kernel/C/gamma for an SVC, report the best result, and
    return a fresh probability-enabled SVC refit with the best parameters.

    NOTE(review): the incoming `svc` argument is immediately discarded and a
    fresh SVC() is searched instead — confirm this is intentional.
    """
    grid = {
        "kernel": ['linear', 'rbf', 'poly'],
        "C": [0.1, 1, 10, 100],
        "gamma": [0.01, 0.1, 1, 10],
    }
    search = GridSearchCV(SVC(), grid, cv=3, scoring='accuracy', n_jobs=-1)
    search.fit(Xtr, ytr)
    best_score = search.best_score_
    best_params = search.best_params_
    # Refit a new model with the winning parameters, probability estimates on.
    tuned = SVC(kernel=best_params['kernel'], C=best_params['C'],
                gamma=best_params['gamma'], probability=True)
    tuned.fit(Xtr, ytr)
    summary = PrettyTable()
    summary.field_names = ["Hyper Tuning", "Results"]
    summary.add_row(["Best Accuracy", best_score])
    summary.add_row(["", ""])
    summary.add_row(["Best Parameter", best_params])
    print(summary)
    return tuned
from sklearn import model_selection
# Number of folds used for cross validation.
SPLITS = 10
def KFoldCrossValidation(name, model, X, y, scoring):
    """Run SPLITS-fold cross validation on (X, y); print per-fold scores and
    a summary table, then return (mean, std) of the scores."""
    folds = model_selection.KFold(n_splits=SPLITS)
    scores = model_selection.cross_val_score(model, X, y, cv=folds,
                                             scoring=scoring)
    print(scores)
    mean, std = scores.mean(), scores.std()
    summary = PrettyTable()
    summary.field_names = ["Cross Validation", "Score Mean",
                           "Score Standard Deviation"]
    summary.add_row([name, mean, std])
    print(summary)
    return mean, std
# A single Function to execute all the steps for Model Evaluation
def ModelEvaluation(X, y, name, model, hyperTuneFunc, scoring):
    """End-to-end evaluation: split, scale, fit, score, cross-validate,
    then hyper-tune and score again.

    Returns:
        (test accuracy, cross-validation mean score, test accuracy after
        hyper-parameter tuning).
    """
    # Creating Training & Test Set
    Xtrain, Xtest, ytrain, ytest = SplitTrainTestData(X, y)
    # Standardize the features.
    # BUG FIX: the scaler must be fit on the training set only and then
    # applied to the test set. The original fit a *second* scaler on the
    # test data, which leaks test-set statistics and distorts evaluation.
    scaler = preprocessing.StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)
    # Model Training/Fitting
    model = ModelFit(model, Xtrain, ytrain)
    # Get Model Prediction & Accuracy Score
    pred, scr = ModelPredict(model, Xtest, ytest)
    # Results of model
    PrintResults(model, pred, Xtrain, Xtest, ytrain, ytest)
    # Perform Cross Validation
    # NOTE(review): CV runs on the unscaled X, as before — a Pipeline would
    # scale inside each fold; left unchanged to preserve reported scores.
    mean, std = KFoldCrossValidation(name, model, X, y, scoring)
    # Tune the hyper parameters on the scaled training data.
    hy_model = hyperTuneFunc(model, Xtrain, ytrain)
    # Prediction & accuracy of the tuned model
    hy_pred, hy_scr = ModelPredict(hy_model, Xtest, ytest)
    # Results of the tuned model
    PrintResults(hy_model, hy_pred, Xtrain, Xtest, ytrain, ytest)
    return scr, mean, hy_scr
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Approach 1: drop one column from each highly correlated pair, then evaluate.
# Dropping the highly correlated columns from dataset
df = processed_df
droppedCols = ["max.length_rectangularity", "scaled_radius_of_gyration", "skewness_about.2",
"scatter_ratio", "elongatedness", "pr.axis_rectangularity", "scaled_variance", "scaled_variance.1"]
df1 = df.drop(droppedCols, axis=1)
# Splitting Data - Extract Target Column
X, y = SplitData(df1)
name = "Approach 1 - Drop Features"
model = SVC(kernel='linear', probability=True)
scoring = 'accuracy'
scr_a1, scr_cv_a1, scr_a1_hy = ModelEvaluation(X, y, name, model, SVCTuneHyperParams, scoring)
from sklearn.decomposition import PCA
# Function to perform PCA analysis based on supplied parameters
def PCAFit(X, component):
    """Standardize X, fit a PCA with `component` components, print the PCA
    results, and return (fitted pca, standardized data)."""
    scaled = ScaleData(X)
    # PCA technique implementation
    pca = PCA(n_components=component)
    pca.fit(scaled)
    PrintPCAResults(pca)
    return pca, scaled
# Function to print the results of PCA Fit
def PrintPCAResults(m):
    """Tabulate the eigenvalues, eigenvectors and explained-variance ratios
    of a fitted PCA model."""
    table = PrettyTable()
    table.field_names = ["PCA Analysis", "Result"]
    table.add_row(["Eigen Values", m.explained_variance_])
    table.add_row(["", ""])
    table.add_row(["Eigen Vectors", m.components_])
    table.add_row(["", ""])
    table.add_row(["Variation Ratio", m.explained_variance_ratio_])
    print(table)
def PCATransform(X, component):
    """Fit a PCA on standardized X and return the projected data."""
    pca, scaled = PCAFit(X, component)
    return pca.transform(scaled)
# Approach 2 (baseline): evaluate the SVC on the full, untransformed feature set.
df = processed_df
# Splitting Original Data - Extract Target Column
X, y = SplitData(df)
name = "Approach 2 - Original Data"
model = SVC(kernel='linear', probability=True)
scoring = 'accuracy'
scr_a2, scr_cv_a2, scr_a2_hy = ModelEvaluation(X, y, name, model, SVCTuneHyperParams, scoring)
As a starting point, we will perform a PCA fit with the number of components equal to the number of feature columns in the dataset.
df = processed_df
# Splitting Original Data - Extract Target Column
X, y = SplitData(df)
# BUG FIX: fit PCA on the feature matrix X, not on the full dataframe `df` —
# the original passed `df`, which leaks the encoded target column into the
# PCA. len(X.columns) equals the old len(df.columns) - 1, so the number of
# components is unchanged.
pca, X = PCAFit(X, len(X.columns))
# Plotting the variance explained by the principal components and the cumulative variance explained.
expVar = pca.explained_variance_ratio_
length = len(pca.explained_variance_ratio_) + 1
cExpVar = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 5))
plt.bar(range(1, length), expVar, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(1, length), cExpVar, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Approach 2 with PCA: project the (standardized) features onto 7 components.
df = processed_df
# Splitting Original Data - Extract Target Column
X, y = SplitData(df)
pcadf = PCATransform(X, 7)
Showing the pair plot of the PCA-transformed data with only 7 principal components.
# Wrap the projected array in a DataFrame for plotting and modelling.
# NOTE(review): this rebinds `pca` (previously the fitted PCA object) to a
# DataFrame — later code only uses the DataFrame, but the shadowing is fragile.
pca = pd.DataFrame(pcadf)
sns.pairplot(pca, diag_kind='kde')
name = "Approach 2 - PCA Data"
model = SVC(kernel='linear', probability=True)
scoring = 'accuracy'
scr_a2_pca, scr_cv_a2_pca, scr_a2_hy_pca = ModelEvaluation(pca, y, name, model, SVCTuneHyperParams, scoring)
# Final side-by-side comparison of all three approaches.
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model Name", "Accuracy Score", "Cross Validation Score", "Accuracy Score - Hyper Tuning"]
x.add_row(["A1: Approach 1 - Drop Features", scr_a1, scr_cv_a1, scr_a1_hy])
x.add_row(["A2: Approach 2 - Original Data", scr_a2, scr_cv_a2, scr_a2_hy])
x.add_row(["A2-PCA: Approach 2 - PCA Data", scr_a2_pca, scr_cv_a2_pca, scr_a2_hy_pca])
print(x)